import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.path import Path
from matplotlib.patches import PathPatch
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
car_deho_india = pd.read_csv('car_deho_india_original.csv')
car = car_deho_india.dropna()
car
name | year | selling_price | km_driven | fuel | seller_type | transmission | owner | |
---|---|---|---|---|---|---|---|---|
0 | Maruti 800 AC | 2007 | 60000 | 70000 | Petrol | Individual | Manual | First Owner |
1 | Maruti Wagon R LXI Minor | 2007 | 135000 | 50000 | Petrol | Individual | Manual | First Owner |
2 | Hyundai Verna 1.6 SX | 2012 | 600000 | 100000 | Diesel | Individual | Manual | First Owner |
3 | Datsun RediGO T Option | 2017 | 250000 | 46000 | Petrol | Individual | Manual | First Owner |
4 | Honda Amaze VX i-DTEC | 2014 | 450000 | 141000 | Diesel | Individual | Manual | Second Owner |
... | ... | ... | ... | ... | ... | ... | ... | ... |
4335 | Hyundai i20 Magna 1.4 CRDi (Diesel) | 2014 | 409999 | 80000 | Diesel | Individual | Manual | Second Owner |
4336 | Hyundai i20 Magna 1.4 CRDi | 2014 | 409999 | 80000 | Diesel | Individual | Manual | Second Owner |
4337 | Maruti 800 AC BSIII | 2009 | 110000 | 83000 | Petrol | Individual | Manual | Second Owner |
4338 | Hyundai Creta 1.6 CRDi SX Option | 2016 | 865000 | 90000 | Diesel | Individual | Manual | First Owner |
4339 | Renault KWID RXT | 2016 | 225000 | 40000 | Petrol | Individual | Manual | First Owner |
4340 rows × 8 columns
car.nunique()
name 1491 year 27 selling_price 445 km_driven 770 fuel 5 seller_type 3 transmission 2 owner 5 make 29 dtype: int64
car['make'] = car['name'].str.split(' ').str[0]
car
name | year | selling_price | km_driven | fuel | seller_type | transmission | owner | make | |
---|---|---|---|---|---|---|---|---|---|
0 | Maruti 800 AC | 2007 | 60000 | 70000 | Petrol | Individual | Manual | First Owner | Maruti |
1 | Maruti Wagon R LXI Minor | 2007 | 135000 | 50000 | Petrol | Individual | Manual | First Owner | Maruti |
2 | Hyundai Verna 1.6 SX | 2012 | 600000 | 100000 | Diesel | Individual | Manual | First Owner | Hyundai |
3 | Datsun RediGO T Option | 2017 | 250000 | 46000 | Petrol | Individual | Manual | First Owner | Datsun |
4 | Honda Amaze VX i-DTEC | 2014 | 450000 | 141000 | Diesel | Individual | Manual | Second Owner | Honda |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4335 | Hyundai i20 Magna 1.4 CRDi (Diesel) | 2014 | 409999 | 80000 | Diesel | Individual | Manual | Second Owner | Hyundai |
4336 | Hyundai i20 Magna 1.4 CRDi | 2014 | 409999 | 80000 | Diesel | Individual | Manual | Second Owner | Hyundai |
4337 | Maruti 800 AC BSIII | 2009 | 110000 | 83000 | Petrol | Individual | Manual | Second Owner | Maruti |
4338 | Hyundai Creta 1.6 CRDi SX Option | 2016 | 865000 | 90000 | Diesel | Individual | Manual | First Owner | Hyundai |
4339 | Renault KWID RXT | 2016 | 225000 | 40000 | Petrol | Individual | Manual | First Owner | Renault |
4340 rows × 9 columns
car.groupby('name')['name'].count()
name Ambassador CLASSIC 1500 DSL AC 2 Ambassador Classic 2000 Dsz 1 Ambassador Grand 1800 ISZ MPFI PW CL 1 Audi A4 1.8 TFSI 1 Audi A4 2.0 TDI 1 .. Volkswagen Vento Petrol Highline AT 2 Volvo V40 D3 R Design 1 Volvo XC 90 D5 Inscription BSIV 1 Volvo XC60 D3 Kinetic 1 Volvo XC60 D5 Inscription 1 Name: name, Length: 1491, dtype: int64
car.drop('name', axis=1, inplace=True)
car
year | selling_price | km_driven | fuel | seller_type | transmission | owner | make | |
---|---|---|---|---|---|---|---|---|
0 | 2007 | 60000 | 70000 | Petrol | Individual | Manual | First Owner | Maruti |
1 | 2007 | 135000 | 50000 | Petrol | Individual | Manual | First Owner | Maruti |
2 | 2012 | 600000 | 100000 | Diesel | Individual | Manual | First Owner | Hyundai |
3 | 2017 | 250000 | 46000 | Petrol | Individual | Manual | First Owner | Datsun |
4 | 2014 | 450000 | 141000 | Diesel | Individual | Manual | Second Owner | Honda |
... | ... | ... | ... | ... | ... | ... | ... | ... |
4335 | 2014 | 409999 | 80000 | Diesel | Individual | Manual | Second Owner | Hyundai |
4336 | 2014 | 409999 | 80000 | Diesel | Individual | Manual | Second Owner | Hyundai |
4337 | 2009 | 110000 | 83000 | Petrol | Individual | Manual | Second Owner | Maruti |
4338 | 2016 | 865000 | 90000 | Diesel | Individual | Manual | First Owner | Hyundai |
4339 | 2016 | 225000 | 40000 | Petrol | Individual | Manual | First Owner | Renault |
4340 rows × 8 columns
car['price_EUR'] = (car['selling_price'] * 0.012).round(0)
car['price_EUR']
0 720.0 1 1620.0 2 7200.0 3 3000.0 4 5400.0 ... 4335 4920.0 4336 4920.0 4337 1320.0 4338 10380.0 4339 2700.0 Name: price_EUR, Length: 4340, dtype: float64
car['price_EUR'].sort_values()
2662 240.0 2495 264.0 2444 360.0 3206 420.0 2849 420.0 ... 1023 59400.0 2239 59400.0 3969 66000.0 89 97800.0 3872 106800.0 Name: price_EUR, Length: 4340, dtype: float64
def if_salygos(x):
if x < 1000:
return "Iki 1000"
elif x >= 1000 and x < 2000:
return "Iki 2000"
elif x >= 2000 and x < 3000:
return "Iki 3000"
elif x >= 3000 and x < 4000:
return "Iki 4000"
elif x >= 4000 and x < 5000:
return "Iki 5000"
elif x >= 5000 and x < 7000:
return "Iki 7000"
elif x >= 7000 and x < 10000:
return "Iki 10000"
elif x >= 10000 and x < 20000:
return "Iki 20000"
else:
return "Nuo 20000"
reiksmes_vektorizuoti = np.vectorize(if_salygos)
sugrupuoti = reiksmes_vektorizuoti(car['price_EUR'])
car['price_group'] = sugrupuoti
car
year | selling_price | km_driven | fuel | seller_type | transmission | owner | make | price_EUR | price_group | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2007 | 60000 | 70000 | Petrol | Individual | Manual | First Owner | Maruti | 720.0 | Iki 1000 |
1 | 2007 | 135000 | 50000 | Petrol | Individual | Manual | First Owner | Maruti | 1620.0 | Iki 2000 |
2 | 2012 | 600000 | 100000 | Diesel | Individual | Manual | First Owner | Hyundai | 7200.0 | Iki 10000 |
3 | 2017 | 250000 | 46000 | Petrol | Individual | Manual | First Owner | Datsun | 3000.0 | Iki 4000 |
4 | 2014 | 450000 | 141000 | Diesel | Individual | Manual | Second Owner | Honda | 5400.0 | Iki 7000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4335 | 2014 | 409999 | 80000 | Diesel | Individual | Manual | Second Owner | Hyundai | 4920.0 | Iki 5000 |
4336 | 2014 | 409999 | 80000 | Diesel | Individual | Manual | Second Owner | Hyundai | 4920.0 | Iki 5000 |
4337 | 2009 | 110000 | 83000 | Petrol | Individual | Manual | Second Owner | Maruti | 1320.0 | Iki 2000 |
4338 | 2016 | 865000 | 90000 | Diesel | Individual | Manual | First Owner | Hyundai | 10380.0 | Iki 20000 |
4339 | 2016 | 225000 | 40000 | Petrol | Individual | Manual | First Owner | Renault | 2700.0 | Iki 3000 |
4340 rows × 10 columns
car.groupby('price_group').count()
year | selling_price | km_driven | fuel | seller_type | transmission | owner | make | price_EUR | |
---|---|---|---|---|---|---|---|---|---|
price_group | |||||||||
Iki 1000 | 236 | 236 | 236 | 236 | 236 | 236 | 236 | 236 | 236 |
Iki 10000 | 656 | 656 | 656 | 656 | 656 | 656 | 656 | 656 | 656 |
Iki 2000 | 600 | 600 | 600 | 600 | 600 | 600 | 600 | 600 | 600 |
Iki 20000 | 388 | 388 | 388 | 388 | 388 | 388 | 388 | 388 | 388 |
Iki 3000 | 532 | 532 | 532 | 532 | 532 | 532 | 532 | 532 | 532 |
Iki 4000 | 652 | 652 | 652 | 652 | 652 | 652 | 652 | 652 | 652 |
Iki 5000 | 431 | 431 | 431 | 431 | 431 | 431 | 431 | 431 | 431 |
Iki 7000 | 706 | 706 | 706 | 706 | 706 | 706 | 706 | 706 | 706 |
Nuo 20000 | 139 | 139 | 139 | 139 | 139 | 139 | 139 | 139 | 139 |
Daugiau kaip 50 K km nuvažiavę bei turintys "mechaninę" pavarų dėžę automobiliai yra pigesni.
(Nepasitvirtino)
car.head(1)
year | selling_price | km_driven | fuel | seller_type | transmission | owner | make | price_EUR | price_group | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2007 | 60000 | 70000 | Petrol | Individual | Manual | First Owner | Maruti | 720.0 | Iki 1000 |
car[car['km_driven'] >= 50000].groupby(['price_group', 'transmission'])['price_EUR'].count()
price_group transmission Iki 1000 Automatic 1 Manual 192 Iki 10000 Automatic 22 Manual 282 Iki 2000 Automatic 11 Manual 514 Iki 20000 Automatic 62 Manual 105 Iki 3000 Automatic 9 Manual 376 Iki 4000 Automatic 13 Manual 408 Iki 5000 Automatic 13 Manual 263 Iki 7000 Automatic 16 Manual 368 Nuo 20000 Automatic 37 Manual 5 Name: price_EUR, dtype: int64
Individualūs savininkai automobilius parduoda pigiau (mažiau reklamos, nesuteikia garantijos, sensni, patikimumas ir kt.).
(Visiškai pasitvirtino)
car.head(1)
year | selling_price | km_driven | fuel | seller_type | transmission | owner | make | price_EUR | price_group | km_group | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007 | 60000 | 70000 | Petrol | Individual | Manual | First Owner | Maruti | 720.0 | Iki 1000 | Iki 100000 km |
print(car.groupby(['seller_type'])['price_EUR'].mean().round(0))
seller_type Dealer 8662.0 Individual 5094.0 Trustmark Dealer 10979.0 Name: price_EUR, dtype: float64
Norint įsigyti automobilį nuvažiavusį ne daugiau 50 K km ir ne senesnį kaip 3 metų amžiaus, dažniausiai reikia "pasiruošti" bent 10 k EUR.
(Dažniausiai taip)
car.head(1)
year | selling_price | km_driven | fuel | seller_type | transmission | owner | make | price_EUR | price_group | km_group | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007 | 60000 | 70000 | Petrol | Individual | Manual | First Owner | Maruti | 720.0 | Iki 1000 | Iki 100000 km |
car[car['km_driven'] <= 50000].groupby(['year'])['price_EUR'].mean().round(0)
year 1996 3000.0 1998 5112.0 1999 1100.0 2000 891.0 2001 2314.0 2002 1245.0 2003 679.0 2004 1410.0 2005 1107.0 2006 4105.0 2007 1832.0 2008 2108.0 2009 2953.0 2010 3446.0 2011 3023.0 2012 4097.0 2013 5483.0 2014 5963.0 2015 6242.0 2016 6534.0 2017 9195.0 2018 10928.0 2019 13021.0 2020 10043.0 Name: price_EUR, dtype: float64
car.plot.bar(x='price_group', y='km_driven', title='Hipotezė 3')
<AxesSubplot:title={'center':'Hipotezė 3'}, xlabel='price_group'>
Jei pirkėjas turi iki 3 k EUR, reikėtų tikėtis automobilio su "mechanine" pavarų dėže ir nuvažiavusio bent 100 k km.
(Visiškai nepasitvirtino)
car.head(1)
year | selling_price | km_driven | fuel | seller_type | transmission | owner | make | price_EUR | price_group | km_group | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007 | 60000 | 70000 | Petrol | Individual | Manual | First Owner | Maruti | 720.0 | Iki 1000 | Iki 100000 km |
def if_km(x):
if x <= 100000:
return "Iki 100000 km"
else:
return "nuo 100000 km"
reiksmes_vektorizuoti = np.vectorize(if_km)
sugrupuoti = reiksmes_vektorizuoti(car['km_driven'])
car['km_group'] = sugrupuoti
car.head(6)
year | selling_price | km_driven | fuel | seller_type | transmission | owner | make | price_EUR | price_group | km_group | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007 | 60000 | 70000 | Petrol | Individual | Manual | First Owner | Maruti | 720.0 | Iki 1000 | Iki 100000 km |
1 | 2007 | 135000 | 50000 | Petrol | Individual | Manual | First Owner | Maruti | 1620.0 | Iki 2000 | Iki 100000 km |
2 | 2012 | 600000 | 100000 | Diesel | Individual | Manual | First Owner | Hyundai | 7200.0 | Iki 10000 | Iki 100000 km |
3 | 2017 | 250000 | 46000 | Petrol | Individual | Manual | First Owner | Datsun | 3000.0 | Iki 4000 | Iki 100000 km |
4 | 2014 | 450000 | 141000 | Diesel | Individual | Manual | Second Owner | Honda | 5400.0 | Iki 7000 | nuo 100000 km |
5 | 2007 | 140000 | 125000 | Petrol | Individual | Manual | First Owner | Maruti | 1680.0 | Iki 2000 | nuo 100000 km |
car[(car['price_EUR'] < 3000)].groupby(['km_group', 'transmission'])['price_EUR'].mean().round(0)
km_group transmission Iki 100000 km Automatic 2020.0 Manual 1786.0 nuo 100000 km Automatic 1824.0 Manual 1691.0 Name: price_EUR, dtype: float64
Vertinant automobilio ridą, kokia turėtų būti jo kaina.
car.plot.scatter(x='km_driven', y='price_EUR')
<AxesSubplot:xlabel='km_driven', ylabel='price_EUR'>
sns.jointplot(x='km_driven', y='price_EUR', data=car)
sns.displot(car['price_EUR'])
<seaborn.axisgrid.FacetGrid at 0x1b1f3b51490>
x = car[['km_driven']]
y = car['price_EUR']
X = x
model = LinearRegression()
model.fit(x, y)
LinearRegression()
prediction = model.predict(x)
prediction
array([5941.22175259, 6513.63305302, 5082.60480194, ..., 5569.15440731, 5368.81045215, 6799.83870324])
model.predict([[10000]]).round(0)
array([7658.])
r_squared = model.score(x, y)
r_squared
0.036974943094581936
plt.scatter(x, y)
plt.plot(x, prediction, 'green')
[<matplotlib.lines.Line2D at 0x1b1f3962af0>]
car.to_csv('car_deho_india_original_for_Tableau_cool.csv')